// FSRCNNX_x2_8-0-4-1_LineArt
// Ported from https://github.com/igv/FSRCNN-TensorFlow

//!MAGPIE EFFECT
//!VERSION 4


// Source image. Pass 1 reads it with GatherRed/Green/Blue and converts it to luma.
//!TEXTURE
Texture2D INPUT;

// Result texture, declared at twice the input width and height.
//!TEXTURE
//!WIDTH INPUT_WIDTH * 2
//!HEIGHT INPUT_HEIGHT * 2
Texture2D OUTPUT;

// First four of the eight channels written by Pass 1; read by Pass 2.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D featureMap1;

// Remaining four channels written by Pass 1; read by Pass 2.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D featureMap2;

// Output of Pass 2 (first float4), input of Pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex1;

// Output of Pass 2 (second float4), input of Pass 3.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex2;

// Listed as an output of Pass 3 in that pass's //!OUT directive.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex3;

// Listed as an output of Pass 3 in that pass's //!OUT directive.
//!TEXTURE
//!WIDTH INPUT_WIDTH
//!HEIGHT INPUT_HEIGHT
//!FORMAT R16G16B16A16_FLOAT
Texture2D tex4;

// Point-filtered sampler; used by every Gather*/SampleLevel call in passes 1-3.
//!SAMPLER
//!FILTER POINT
SamplerState sam;

// Linear-filtered sampler.
// NOTE(review): not referenced by passes 1-3; presumably used by a later pass - confirm.
//!SAMPLER
//!FILTER LINEAR
SamplerState sam1;

//!PASS 1
//!DESC feature map
//!IN INPUT
//!OUT featureMap1, featureMap2
//!BLOCK_SIZE 32, 24
//!NUM_THREADS 128

// Size of the luma tile cached in group-shared memory: the block footprint plus
// 4 extra pixels per axis, because every output pixel in Pass1 reads a 5x5
// window at offsets 0..4 from its own block-local position.
#define SH_PIXELS_X  (MP_BLOCK_WIDTH + 4)
#define SH_PIXELS_Y  (MP_BLOCK_HEIGHT + 4)

// Luma tile shared by all threads of a group, indexed [row][column].
groupshared float shPixelsY[SH_PIXELS_Y][SH_PIXELS_X];

// Collapses an RGB colour to a single luma value using the
// 0.299 / 0.587 / 0.114 weighting (the Rec. 601 luma coefficients).
float GetLuma(float3 rgb) {
	const float3 lumaWeights = float3(0.299f, 0.587f, 0.114f);
	return dot(lumaWeights, rgb);
}

// Pass 1 (feature map): converts INPUT to luma, caches a padded tile of it in
// group-shared memory, then applies a 5x5 convolution producing 8 channels
// per pixel, stored as two float4 values in featureMap1 and featureMap2.
//   blockStart - pixel origin of the block this thread group processes;
//                block-local positions are added to it
//   threadId   - thread index within the group; only .x is used
void Pass1(uint2 blockStart, uint3 threadId) {
	float2 inputPt = GetInputPt();
	uint2 inputSize = GetInputSize();
	uint i;

	// Stage 1: cooperatively fill the shared luma tile, one 2x2 quad per
	// iteration. i is always even and advances through half the tile area, so
	// pos.y steps over row pairs (presumably a 36x28 tile for the 32x24 block
	// size declared for this pass, which keeps pos.x even as well).
	for (i = threadId.x * 2; i < SH_PIXELS_X * SH_PIXELS_Y / 2; i += MP_NUM_THREADS_X * 2) {
		uint2 pos = uint2(i % SH_PIXELS_X, i / SH_PIXELS_X * 2);
		// NOTE(review): the -1.5 offset appears chosen so the gathered quad starts
		// 2 pixels up/left of blockStart + pos, giving the 5x5 window its 2-pixel
		// border - confirm against the Gather footprint rules.
		const float2 tpos = (blockStart + pos - 1.5f) * inputPt;

		// One gather per colour channel returns that channel for all four texels.
		const float4 sr = INPUT.GatherRed(sam, tpos);
		const float4 sg = INPUT.GatherGreen(sam, tpos);
		const float4 sb = INPUT.GatherBlue(sam, tpos);

		// Gather component order: w = top-left, z = top-right,
		// x = bottom-left, y = bottom-right.
		shPixelsY[pos.y][pos.x] = GetLuma(float3(sr.w, sg.w, sb.w));
		shPixelsY[pos.y][pos.x + 1] = GetLuma(float3(sr.z, sg.z, sb.z));
		shPixelsY[pos.y + 1][pos.x] = GetLuma(float3(sr.x, sg.x, sb.x));
		shPixelsY[pos.y + 1][pos.x + 1] = GetLuma(float3(sr.y, sg.y, sb.y));
	}

	// The whole tile must be written before any thread starts reading it.
	GroupMemoryBarrierWithGroupSync();

	// Stage 2: each thread walks the block's pixels with a stride of the group
	// size and convolves the 5x5 luma window around each one.
	for (i = threadId.x; i < MP_BLOCK_WIDTH * MP_BLOCK_HEIGHT; i += MP_NUM_THREADS_X) {
		const uint2 pos = uint2(i % MP_BLOCK_WIDTH, i / MP_BLOCK_WIDTH);
		const uint2 destPos = blockStart + pos;

		// Blocks on the right/bottom edge may extend past the image; skip those pixels.
		if (destPos.x >= inputSize.x || destPos.y >= inputSize.y) {
			continue;
		}

		// Copy the 5x5 window into registers. Note the transposed layout: the
		// first index of src is the column offset, the second the row offset.
		// The inner `i` shadows the outer loop counter within this body only.
		float src[5][5];
		[unroll]
		for (int i = 0; i < 5; ++i) {
			[unroll]
			for (int j = 0; j < 5; ++j) {
				src[j][i] = shPixelsY[pos.y + i][pos.x + j];
			}
		}

		// Output channels 0-3: constant (bias) term, then one weighted tap per window pixel.
		float4 target1 = float4(-0.3117050230503082, 0.1817725896835327, 0.0011673698900267, -0.0044658286496997);
		target1 += float4(-0.0187959559261799, -0.0206312909722328, 0.0226501729339361, 0.0111862262710929) * src[0][0];
		target1 += float4(0.0469042696058750, 0.0428658165037632, -0.0208927169442177, -0.0053485808894038) * src[0][1];
		target1 += float4(0.0486242026090622, 0.0268428903073072, -0.1095351055264473, -0.0197027549147606) * src[0][2];
		target1 += float4(-0.0301427692174911, -0.0444439016282558, 0.0803908482193947, -0.0072240661829710) * src[0][3];
		target1 += float4(0.0097448397427797, 0.0132117131724954, -0.0087575586512685, 0.0003270092420280) * src[0][4];
		target1 += float4(0.0227436870336533, 0.0284603293985128, -0.0899902656674385, 0.0174379274249077) * src[1][0];
		target1 += float4(-0.0880827009677887, -0.0890802741050720, 0.3386772871017456, -0.0749290063977242) * src[1][1];
		target1 += float4(-0.0832799598574638, -0.1518420130014420, 0.1693033277988434, 0.1514045447111130) * src[1][2];
		target1 += float4(0.0490957386791706, 0.0839962288737297, 0.0323486365377903, -0.0491475425660610) * src[1][3];
		target1 += float4(0.0281097982078791, 0.0267692077904940, -0.0460123419761658, 0.0137899341061711) * src[1][4];
		target1 += float4(0.0592067055404186, -0.0008030450553633, 0.1280025541782379, -0.0270480886101723) * src[2][0];
		target1 += float4(-0.0784756019711494, -0.0078630214557052, -0.1963789612054825, 0.2132134586572647) * src[2][1];
		target1 += float4(0.9478371739387512, -0.7432878613471985, -0.4691794812679291, -0.4196422100067139) * src[2][2];
		target1 += float4(0.1578149050474167, -0.0874812081456184, 0.1223142221570015, 0.2514914274215698) * src[2][3];
		target1 += float4(0.0576529577374458, 0.0775778889656067, 0.0526014007627964, -0.1151828765869141) * src[2][4];
		target1 += float4(-0.0459806136786938, -0.0550342053174973, -0.0553226508200169, -0.0042642662301660) * src[3][0];
		target1 += float4(0.1346504986286163, 0.1795998811721802, -0.0741422399878502, -0.0004661275597755) * src[3][1];
		target1 += float4(-0.0344312079250813, -0.0998986735939980, 0.2834288179874420, 0.1789152175188065) * src[3][2];
		target1 += float4(-0.0376542955636978, -0.0137260686606169, -0.2183600962162018, -0.0829529240727425) * src[3][3];
		target1 += float4(0.0143303163349628, 0.0085790483281016, 0.0312815308570862, 0.0557830408215523) * src[3][4];
		target1 += float4(0.0196402054280043, 0.0245775021612644, 0.0333996489644051, 0.0064323167316616) * src[4][0];
		target1 += float4(-0.0247105974704027, -0.0139399459585547, 0.0039188005030155, 0.0138866743072867) * src[4][1];
		target1 += float4(0.0688862130045891, 0.0629303157329559, -0.0323157459497452, -0.1300792843103409) * src[4][2];
		target1 += float4(0.0111092608422041, 0.0116711426526308, 0.0460555553436279, 0.0563828162848949) * src[4][3];
		target1 += float4(-0.0043270774185658, -0.0096766958013177, -0.0235258601605892, -0.0409700050950050) * src[4][4];

		// Output channels 4-7, same structure.
		float4 target2 = float4(0.0165165197104216, 0.0061719734221697, -0.0008248710073531, -0.0774794667959213);
		target2 += float4(-0.0127812735736370, -0.0146999256685376, 0.0025963818188757, 0.0008133125957102) * src[0][0];
		target2 += float4(0.0192508958280087, 0.0089628640562296, 0.0046624913811684, -0.0005601323791780) * src[0][1];
		target2 += float4(-0.1021092385053635, -0.0491660982370377, -0.0818324312567711, -0.0719010531902313) * src[0][2];
		target2 += float4(0.0166876111179590, -0.0046075899153948, 0.0258100070059299, -0.0235325042158365) * src[0][3];
		target2 += float4(-0.0028500237967819, -0.0020616643596441, -0.0073093594983220, -0.0034190006554127) * src[0][4];
		target2 += float4(0.0024815262295306, 0.0222324915230274, -0.0080765523016453, 0.0105959763750434) * src[1][0];
		target2 += float4(0.1017390340566635, 0.0138921840116382, 0.0559288635849953, -0.0168517548590899) * src[1][1];
		target2 += float4(0.1267367750406265, -0.2365809977054596, 0.4724994897842407, -0.0154752098023891) * src[1][2];
		target2 += float4(0.0847241580486298, 0.1127829849720001, -0.0643212646245956, 0.0177757386118174) * src[1][3];
		target2 += float4(-0.0354492329061031, -0.0234994646161795, 0.0336676724255085, 0.0153558924794197) * src[1][4];
		target2 += float4(-0.1001686528325081, 0.0175829399377108, -0.0146998856216669, -0.0897502079606056) * src[2][0];
		target2 += float4(0.0973328053951263, -0.5987607836723328, -0.0770601108670235, 0.2343221157789230) * src[2][1];
		target2 += float4(-1.0639246702194214, 0.5335622429847717, -0.2365868240594864, 0.6484431028366089) * src[2][2];
		target2 += float4(-0.0258918590843678, 0.1439655423164368, 0.2597847878932953, -0.5380389094352722) * src[2][3];
		target2 += float4(0.0333042629063129, -0.0408495217561722, 0.0026879014912993, 0.0496195442974567) * src[2][4];
		target2 += float4(0.0017764334334061, 0.0032939016819000, -0.0121603077277541, -0.0066827093251050) * src[3][0];
		target2 += float4(0.0497846752405167, 0.0766935721039772, 0.0505562871694565, 0.0058483541943133) * src[3][1];
		target2 += float4(0.6903248429298401, 0.0658241882920265, -0.4562527537345886, -0.0117225451394916) * src[3][2];
		target2 += float4(0.1896255612373352, -0.0459045991301537, -0.0380226671695709, -0.0333303771913052) * src[3][3];
		target2 += float4(-0.0868696048855782, 0.0157926902174950, 0.0011628456413746, 0.0207170285284519) * src[3][4];
		target2 += float4(0.0130701754242182, -0.0067251212894917, -0.0007082104566507, -0.0017002354143187) * src[4][0];
		target2 += float4(0.0029672298114747, -0.0060487915761769, 0.0191176552325487, 0.0520425662398338) * src[4][1];
		target2 += float4(-0.0253955777734518, -0.0159530192613602, 0.0304108783602715, -0.0263646803796291) * src[4][2];
		target2 += float4(-0.0708072409033775, 0.0109798992052674, 0.0285820439457893, 0.0188453849405050) * src[4][3];
		target2 += float4(0.0698847994208336, -0.0164128411561251, 0.0043246182613075, -0.0244176983833313) * src[4][4];

		featureMap1[destPos] = target1;
		featureMap2[destPos] = target2;
	}
}


//!PASS 2
//!DESC mapping 1
//!IN featureMap1, featureMap2
//!OUT tex1, tex2
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Pass 2 (mapping 1): for one pixel, reads the 3x3 neighbourhood of the eight
// feature channels (featureMap1 + featureMap2), mixes them through 4x4 weight
// matrices into eight new channels, applies the activation
// max(x, 0) + slope * min(x, 0) per channel, and writes tex1 / tex2.
//   blockStart - pixel origin of the block this thread group processes
//   threadId   - thread index within the group; only .x is used
void Pass2(uint2 blockStart, uint3 threadId) {
	// NOTE(review): Rmp8x8 is defined elsewhere; presumably it maps the linear
	// thread index to a 2D position inside the 8x8 block - confirm.
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	// Threads that fall outside the image have nothing to write.
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	// Texture coordinate of this pixel's centre, assuming inputPt is the
	// reciprocal of the input size - TODO confirm.
	float2 pos = (gxy + 0.5f) * inputPt;

	// Neighbourhood naming (t/m/b = top/middle/bottom row, l/c/r = left/centre/right column):
	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	// Suffix 1 = featureMap1 (channels 0-3), suffix 2 = featureMap2 (channels 4-7).
	// Neighbours are one inputPt step away; at image borders the result depends
	// on the address mode of `sam`, which is not set in this file section.
	float4 tl1 = featureMap1.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = featureMap1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = featureMap1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = featureMap1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = featureMap1.SampleLevel(sam, pos, 0);
	float4 bc1 = featureMap1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = featureMap1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = featureMap1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = featureMap1.SampleLevel(sam, pos + inputPt, 0);

	float4 tl2 = featureMap2.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = featureMap2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = featureMap2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = featureMap2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = featureMap2.SampleLevel(sam, pos, 0);
	float4 bc2 = featureMap2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = featureMap2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = featureMap2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = featureMap2.SampleLevel(sam, pos + inputPt, 0);

	// Output channels 0-3: constant (bias) term, then one row-vector * matrix
	// product per (tap, input texture) pair, each mapping 4 input channels to
	// 4 output channels.
	float4 target1 = float4(-0.0031195033807307, -0.0977938771247864, 0.0337169878184795, 0.0840695425868034);
	target1 += mul(tl1, float4x4(0.0028950418345630, 0.2153117954730988, -0.1120878234505653, 0.1065240651369095, -0.0902118757367134, 0.2227627933025360, -0.1268638819456100, 0.0378417931497097, 0.0262128096073866, 0.1100647151470184, -0.0224360711872578, -0.2487984448671341, 0.3278627693653107, 0.3930607438087463, -0.3361104130744934, -0.2318559885025024));
	target1 += mul(tl2, float4x4(-0.1388952732086182, -0.0210590325295925, -0.0107318097725511, 0.1340505480766296, -0.2403931617736816, 0.4324082732200623, -0.1229069381952286, -0.1129430904984474, -0.2194076776504517, -0.2529417872428894, 0.2493167072534561, 0.1228863969445229, -0.6289532780647278, 0.2511698901653290, -0.1145481690764427, -0.1931190490722656));
	target1 += mul(ml1, float4x4(0.0802633240818977, -0.2823207676410675, -0.0453533977270126, 0.2149281948804855, 0.2391699999570847, -0.3012505769729614, -0.0672336667776108, 0.1134754717350006, -0.1874532252550125, 0.2426864057779312, 0.0001024203302222, -0.2685940861701965, -0.2393693625926971, -0.0148733090609312, 0.4815890491008759, -0.5666245818138123));
	target1 += mul(ml2, float4x4(0.2353847920894623, 0.7481728792190552, 0.0613395981490612, -0.3136185705661774, -0.6453479528427124, 0.2987860739231110, -0.1935778856277466, -0.4407877624034882, -0.1155721992254257, -0.8143445253372192, -0.1829861551523209, 0.0808847546577454, 0.3689287006855011, -0.1318729221820831, 0.1494798213243484, -0.7250000834465027));
	target1 += mul(bl1, float4x4(-0.2325237691402435, -0.0383906811475754, -0.0762876123189926, 0.0158057715743780, -0.3222318589687347, -0.0946261659264565, -0.1157991588115692, 0.2080847620964050, -0.1521182358264923, 0.4038263857364655, -0.2508496940135956, 0.0620750486850739, 0.1382832378149033, -0.1788915544748306, -0.1054779291152954, -0.1079574525356293));
	target1 += mul(bl2, float4x4(0.0441149584949017, -0.1473216116428375, 0.1350974887609482, -0.2101743519306183, 0.4860914349555969, -0.0438372306525707, 0.1496813595294952, 0.1337997019290924, 0.2939592599868774, -0.2875731289386749, 0.3024089336395264, 0.2730985283851624, 0.3860357403755188, -0.2070714235305786, 0.0471001267433167, 0.0515876151621342));
	target1 += mul(tc1, float4x4(-0.0282726809382439, -0.0808876901865005, -0.1294671446084976, 0.0327957235276699, 0.2005466669797897, 0.0710152760148048, -0.2951858937740326, 0.1284866034984589, -0.3245949447154999, -0.2784474790096283, -0.0651488602161407, 0.2024642229080200, -0.1790685206651688, -0.1523464322090149, 0.0683958381414413, -0.0721997469663620));
	target1 += mul(tc2, float4x4(0.2084605693817139, 0.2224501073360443, 0.2990169227123260, -0.0822417438030243, 0.1661120802164078, 0.2242873460054398, 0.3008987009525299, -0.0589924007654190, 1.0089585781097412, 0.3364263474941254, 0.3114744126796722, -0.4205997586250305, 0.2149223387241364, -0.2686808407306671, 0.6869788169860840, 0.0397010855376720));
	target1 += mul(mc1, float4x4(-0.1719545274972916, 0.2342357635498047, -0.1108281537890434, 0.0051285717636347, -0.5348495244979858, -0.0063809715211391, -0.2947000265121460, 0.0092384787276387, 0.1788431107997894, -0.8757466077804565, -0.0199933666735888, -0.0933040529489517, -1.1017562150955200, -1.1397477388381958, -0.8490890264511108, 2.0844755172729492));
	target1 += mul(mc2, float4x4(-0.7517850399017334, -0.6626257300376892, -1.7181873321533203, 1.3924138545989990, 0.3148886561393738, 1.2373961210250854, 0.8413697481155396, 0.2569177746772766, 0.1905626207590103, -0.8806108832359314, 0.7340399026870728, 1.8838906288146973, -0.1782593727111816, 0.3429502546787262, -0.3488911390304565, -0.6653195619583130));
	target1 += mul(bc1, float4x4(0.1612574905157089, -0.0092199165374041, -0.2294603884220123, 0.2070839852094650, 0.1995067894458771, -0.1586991697549820, -0.1423671096563339, 0.1524601876735687, 0.6368640661239624, -0.1302748024463654, 0.2046667486429214, 0.4024843573570251, 0.3522947132587433, 1.0427794456481934, -0.4195784628391266, -0.7421376705169678));
	target1 += mul(bc2, float4x4(-0.0139375794678926, 0.0099870329722762, 0.1957603991031647, 0.2892707288265228, -0.0361699834465981, 0.5173625946044922, -0.0569337680935860, 0.1873179972171783, -1.0557887554168701, 0.2226557582616806, 0.0604930445551872, 0.3329092264175415, -0.1138753890991211, 0.3199435174465179, 0.0987524166703224, 0.1584812700748444));
	target1 += mul(tr1, float4x4(-0.1148171499371529, 0.0505522675812244, -0.1067250370979309, 0.0587497279047966, -0.3531772792339325, -0.0130594912916422, -0.0051763984374702, 0.0720054879784584, -0.2512235343456268, 0.5235862731933594, 0.1203625276684761, 0.0220968686044216, 0.5066124200820923, -0.2726359069347382, 0.3687904477119446, -0.3189409077167511));
	target1 += mul(tr2, float4x4(0.2119312435388565, -0.0852348133921623, 0.1512662768363953, 0.0316264666616917, 0.2671527862548828, 0.2980401515960693, -0.1022484675049782, -0.1188400015234947, 0.1485718190670013, 0.2684609889984131, 0.1905853003263474, -0.1392537802457809, 0.4225537180900574, 0.0611033178865910, -0.0134558668360114, -0.2009256333112717));
	target1 += mul(mr1, float4x4(-0.0571580827236176, -0.0214836131781340, -0.2749050855636597, 0.0790889635682106, -0.0811165198683739, -0.2081381976604462, -0.3047288656234741, 0.0027117941062897, -0.2009213417768478, -0.7622461318969727, -0.4736055433750153, 0.2779547572135925, 0.4795901477336884, 0.7571166157722473, 1.2372496128082275, -0.7046401500701904));
	target1 += mul(mr2, float4x4(-0.1537595987319946, 0.3831464648246765, -0.1571276187896729, -0.1867597997188568, 0.6104238033294678, 0.0229409243911505, 0.2101978808641434, -0.1350114792585373, 0.4454170167446136, 0.3683053851127625, -0.3370352983474731, -0.3944822549819946, -0.4503754675388336, -0.4261152446269989, 0.5954129695892334, 0.0479046516120434));
	target1 += mul(br1, float4x4(0.2166123390197754, 0.0542660057544708, 0.1477318406105042, 0.1632562726736069, 0.4279211461544037, 0.2100527286529541, -0.0213893372565508, 0.3388189971446991, 0.2154107838869095, -0.2220560610294342, -0.0808312967419624, 0.1158433631062508, -0.1609301865100861, 0.4950682818889618, 0.4356543123722076, -0.0398453846573830));
	target1 += mul(br2, float4x4(0.0525114983320236, 0.1031088456511497, -0.2437869310379028, -0.1797652095556259, -0.2139296382665634, -0.0230520907789469, 0.0188236199319363, 0.2274840623140335, 0.2199348062276840, -0.0977248921990395, 0.0661730542778969, -0.1618098169565201, -0.1215345263481140, -0.2456843554973602, -0.1434712260961533, -0.1226665675640106));
	// Activation: positive part passes through, negative part is scaled by a per-channel slope.
	target1 = max(target1, 0) + float4(-0.9874631166458130, 0.2700935602188110, 1.0154639482498169, 0.7301973104476929) * min(target1, 0);

	// Output channels 4-7, same structure.
	float4 target2 = float4(0.0170604288578033, -0.0097856530919671, 0.0882583037018776, 0.0158541873097420);
	target2 += mul(tl1, float4x4(0.1967887729406357, -0.0514578297734261, 0.0351365692913532, -0.0027727256529033, 0.0978068783879280, -0.3902593851089478, 0.0123964082449675, -0.1210777312517166, 0.3820665776729584, -0.5003674030303955, 0.0546922460198402, -0.0777103230357170, 0.9743819236755371, -0.3239430189132690, -0.2496883124113083, 0.1733392328023911));
	target2 += mul(tl2, float4x4(-0.1924884468317032, 0.5075340867042542, -0.0542841143906116, 0.0434595011174679, 0.1881206482648849, -0.1774751842021942, -0.1752865165472031, 0.0315622761845589, -0.4268247485160828, 0.3984751403331757, -0.0704891532659531, -0.0969574451446533, 0.1777588576078415, 0.2743588685989380, -0.0317226983606815, -0.1993683725595474));
	target2 += mul(ml1, float4x4(-0.1353080570697784, -0.0161478724330664, 0.0713945776224136, 0.1482806354761124, -0.0025943452492356, -0.0296892002224922, 0.1426411569118500, 0.3263220191001892, 0.3354269266128540, -0.0851829424500465, 0.1781585812568665, -0.0421005003154278, -0.5350970029830933, -1.3111218214035034, 0.2340501397848129, -0.9513134956359863));
	target2 += mul(ml2, float4x4(-0.0965117588639259, -0.3247327506542206, 0.1102668121457100, -0.1604842394590378, 0.4602060914039612, 0.0756718367338181, -0.3309438228607178, -0.3001569509506226, 0.2267884165048599, 0.1377216577529907, -0.0426548905670643, 0.1272846758365631, -0.1080727055668831, -0.4640344679355621, -0.3294694125652313, -0.2043451815843582));
	target2 += mul(bl1, float4x4(0.0720937326550484, -0.1846759617328644, -0.1708657741546631, -0.0725364983081818, 0.1260499358177185, -0.1162428930401802, -0.2501497566699982, -0.1012131050229073, 0.2760527133941650, -0.0939920768141747, 0.2875119149684906, 0.0406376719474792, 0.1008657962083817, -0.1240409687161446, -0.4121425449848175, 0.3269978761672974));
	target2 += mul(bl2, float4x4(0.0125542022287846, 0.2582587003707886, 0.1726561784744263, 0.0180624593049288, 0.0579377673566341, -0.0663961246609688, -0.0078865075483918, -0.0506187379360199, -0.0882500410079956, -0.0282228980213404, -0.1616529822349548, -0.1816279888153076, 0.1482390761375427, -0.3260181546211243, 0.1757252663373947, 0.1394872069358826));
	target2 += mul(tc1, float4x4(-0.0768914818763733, -0.1610976904630661, -0.0581125281751156, 0.1043644994497299, -0.0832794085144997, 0.1854220479726791, -0.0097211552783847, 0.2851990759372711, -0.6227292418479919, -0.2649715840816498, 0.4023403823375702, -0.2051993161439896, 0.4956052303314209, 0.8367735743522644, 0.2145122885704041, 0.3317213356494904));
	target2 += mul(tc2, float4x4(0.9697892665863037, -0.2038540095090866, -0.3166446983814240, -0.0504710040986538, 0.1532189846038818, 0.3050784170627594, -0.1300316900014877, -0.2059933692216873, 0.4894859492778778, -0.3882815837860107, -0.7273328304290771, 0.3463444113731384, -0.8791087865829468, -0.2762917280197144, -0.2789021134376526, -0.1907687485218048));
	target2 += mul(mc1, float4x4(0.1201086342334747, 0.1047629937529564, -0.3030976653099060, 0.0362001918256283, -0.2728919386863708, -0.5267004966735840, 0.1090360283851624, -0.2792145609855652, -0.3083780109882355, 0.2616442143917084, 0.4801669716835022, 0.1518263220787048, -1.6350433826446533, 0.7567611336708069, -1.9801075458526611, 1.1116229295730591));
	target2 += mul(mc2, float4x4(-0.6893532276153564, -0.3531652688980103, -0.1891958266496658, 1.7840391397476196, 1.4943064451217651, 0.7292221188545227, -1.7417374849319458, 0.0550648272037506, -0.1120251268148422, 0.8250336647033691, 1.1355321407318115, -1.1124770641326904, -0.2689424455165863, 0.9269363284111023, 1.9637582302093506, -0.3709079027175903));
	target2 += mul(bc1, float4x4(-0.0544882826507092, -0.1718381494283676, -0.1280352175235748, 0.1454906165599823, -0.1176344379782677, 0.0151336872950196, -0.2567785680294037, 0.2511477768421173, 0.3610197603702545, 0.5843607783317566, 0.0848828330636024, 0.1415835469961166, 0.4969498217105865, 0.5772764682769775, 0.5247990489006042, -0.0598939247429371));
	target2 += mul(bc2, float4x4(0.0529166162014008, -0.1535185724496841, -0.1523050367832184, -0.0501741841435432, -0.0633302107453346, -0.3949260413646698, 0.1972121149301529, -0.2604303061962128, -0.1828030794858932, 0.2246686667203903, -0.6004081368446350, 0.0432657450437546, -0.2086566388607025, 0.2799777686595917, 0.2933793962001801, -0.0253354366868734));
	target2 += mul(tr1, float4x4(-0.0009011612855829, -0.0231834072619677, -0.0538895800709724, -0.0315926298499107, -0.1181581020355225, -0.2161513417959213, -0.2679739892482758, -0.0537310577929020, 0.3116895258426666, -0.1043426766991615, -0.4458046257495880, 0.0201701205223799, -0.8122410774230957, -0.5180496573448181, -0.1277437359094620, -0.1671603769063950));
	target2 += mul(tr2, float4x4(0.2782520055770874, 0.2888738214969635, 0.2265798002481461, 0.0807978361845016, -0.1869603991508484, -0.2283953428268433, -0.4374879896640778, 0.2510242760181427, 0.4857149720191956, -0.1796883046627045, 0.2519723176956177, 0.1950220316648483, -0.0102280136197805, -0.4034306704998016, -0.2963733077049255, 0.4622495174407959));
	target2 += mul(mr1, float4x4(0.0423482730984688, -0.2923308312892914, -0.0082442639395595, 0.2373020350933075, 0.1710588335990906, 0.0392467305064201, 0.0116449045017362, 0.3267012536525726, -0.8832122683525085, 0.7230877280235291, -0.2364437282085419, -0.5563997030258179, 0.2307662665843964, -1.0188170671463013, 0.3680693507194519, -0.8103905916213989));
	target2 += mul(mr2, float4x4(-0.1161488518118858, -0.6775091290473938, -0.0844684988260269, -0.3587656021118164, -0.0190705843269825, -0.5557464957237244, 0.3721883893013000, 0.1322396695613861, -0.1157554760575294, -0.1545359939336777, 0.4236145615577698, -0.9936751127243042, -0.0805041715502739, -0.2628504335880280, 0.1407603323459625, -0.4183281958103180));
	target2 += mul(br1, float4x4(-0.0447837486863136, 0.0488513521850109, -0.1358503550291061, 0.0202376656234264, -0.0258089359849691, -0.2975459396839142, -0.1580457836389542, 0.1025377139449120, -0.0301857776939869, -0.1071514338254929, -0.0150549048557878, 0.0794499814510345, 0.4558653235435486, 0.8186704516410828, 0.2287982851266861, -0.1438317447900772));
	target2 += mul(br2, float4x4(0.1536326855421066, 0.2366072386503220, 0.0591898001730442, -0.0547822229564190, 0.1068296432495117, -0.0342746265232563, -0.1388098448514938, 0.3756637275218964, 0.2406303733587265, 0.1881252676248550, -0.1518276780843735, 0.1872117221355438, 0.1484777033329010, 0.1825126409530640, -0.1781855672597885, 0.1650572419166565));
	target2 = max(target2, 0) + float4(-0.5060276985168457, -1.3971502780914307, 0.7606850862503052, -0.5118398666381836) * min(target2, 0);

	tex1[gxy] = target1;
	tex2[gxy] = target2;
}


//!PASS 3
//!DESC mapping 2
//!IN tex1, tex2
//!OUT tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass3(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	float4 tl1 = tex1.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = tex1.SampleLevel(sam, pos, 0);
	float4 bc1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = tex1.SampleLevel(sam, pos + inputPt, 0);

	float4 tl2 = tex2.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = tex2.SampleLevel(sam, pos, 0);
	float4 bc2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = tex2.SampleLevel(sam, pos + inputPt, 0);

	float4 target1 = float4(0.2456959486007690, 0.1773831695318222, -0.0800321474671364, -0.1356369554996490);
	target1 += mul(tl1, float4x4(0.0867862403392792, -0.0188770499080420, -0.1502736657857895, -0.1099725291132927, -0.3013791441917419, 0.0430033504962921, 0.0345034115016460, -0.0400283746421337, 0.0455053038895130, -0.0785564482212067, -0.1695308536291122, 0.0467130616307259, -0.0208115540444851, 0.0026763146743178, -0.1338489353656769, -0.1844547539949417));
	target1 += mul(tl2, float4x4(0.1620235443115234, 0.0102646192535758, -0.0163768343627453, 0.0822434723377228, 0.1546859890222549, 0.0250307265669107, 0.0373145006597042, -0.0117816952988505, 0.0517709590494633, -0.0252467226237059, -0.1536794751882553, -0.0202652048319578, -0.3231309056282043, 0.1350613087415695, 0.1932685226202011, 0.1793868690729141));
	target1 += mul(ml1, float4x4(-0.5794479846954346, 0.1861644536256790, 0.1153499931097031, 0.1069827228784561, 0.4245908558368683, 0.1373304873704910, -0.1780052036046982, 0.0466761402785778, -1.1490619182586670, 0.8257195353507996, 0.0008257642621174, 0.0699498802423477, 0.3857855796813965, 0.1599738448858261, -0.0160159282386303, -0.1072350814938545));
	target1 += mul(ml2, float4x4(-0.0448461174964905, -0.1027067080140114, 0.1546361148357391, -0.1321994811296463, 0.3319362998008728, 0.0670638754963875, -0.0980701223015785, -0.1242648735642433, 0.0487120002508163, 0.1034812107682228, -0.3318608701229095, -0.0663819089531898, -0.7049940824508667, -0.2681597173213959, 0.5434955358505249, 0.3798713982105255));
	target1 += mul(bl1, float4x4(-0.1650677323341370, -0.1774029582738876, -0.0693891644477844, 0.0644233599305153, 0.0096654882654548, 0.0590313524007797, -0.0626199543476105, -0.1101114749908447, -0.0919653624296188, -0.2198607474565506, -0.3686812222003937, -0.0030939118005335, -0.0286871381103992, -0.0267137177288532, -0.2114386558532715, -0.1006813868880272));
	target1 += mul(bl2, float4x4(-0.1621828079223633, -0.3327856957912445, -0.3447196483612061, -0.0941574051976204, 0.1980617940425873, -0.0039776018820703, 0.0631400719285011, -0.0371704883873463, 0.0801121518015862, -0.2228745818138123, -0.1361533999443054, -0.0061448244377971, -0.2867666780948639, -0.0285903755575418, -0.2431204020977020, 0.0123175233602524));
	target1 += mul(tc1, float4x4(-0.0213332772254944, 0.0521896183490753, 0.1267389953136444, 0.0592065081000328, 0.2370698899030685, -0.0788677260279655, -0.0918647274374962, 0.0576282069087029, -0.0021516692359000, -0.1609319597482681, 0.1739181131124496, 0.4105915725231171, -0.0331462696194649, 0.0667985677719116, 0.0239557847380638, 0.2053552418947220));
	target1 += mul(tc2, float4x4(0.0452669933438301, -0.0624216794967651, -0.2210896909236908, -0.2319414317607880, 0.0553048253059387, -0.0195244718343019, -0.0948461145162582, -0.1411011815071106, 0.1357870846986771, -0.0044978843070567, 0.0117386765778065, 0.2855076789855957, 0.0721756964921951, 0.0725790113210678, 0.0879172906279564, 0.2261598110198975));
	target1 += mul(mc1, float4x4(0.1625189036130905, 0.2556113302707672, 0.0096751591190696, 0.4720825850963593, 0.1724947541952133, 0.7794855833053589, -0.5798769593238831, -0.5527915954589844, -0.2930226027965546, -0.1513507068157196, -0.1671935617923737, 0.1259696036577225, -1.5636392831802368, -0.6248261332511902, -0.7778694629669189, 0.7258287668228149));
	target1 += mul(mc2, float4x4(-0.2203702777624130, -0.2413295805454254, 0.5873484611511230, 0.8019542694091797, 0.2896324992179871, -0.0073753874748945, -0.4705016911029816, -0.4037020802497864, -0.5916352272033691, 0.8720123767852783, 1.4509203433990479, -0.4347604215145111, 0.1296572685241699, -0.0382503382861614, 1.0146147012710571, 0.6662492156028748));
	target1 += mul(bc1, float4x4(0.1450282633304596, 0.1838535815477371, -0.2957614958286285, -0.1175402477383614, 0.0455737337470055, -0.1042275950312614, 0.2409446090459824, 0.2161487638950348, 0.2523523867130280, -0.1657318323850632, 0.3264083266258240, 0.0015554791316390, 0.0756938308477402, 0.6486120820045471, 0.2910028994083405, 0.0061267162673175));
	target1 += mul(bc2, float4x4(0.1902535557746887, -1.9667011499404907, 0.5702443718910217, 0.1346294432878494, -0.1391871571540833, 0.0248745214194059, -0.0059022656641901, -0.1015660688281059, -0.0244528464972973, 0.6021597385406494, 0.1408251225948334, 0.1453502923250198, 0.1481679081916809, -0.0782008916139603, 0.1340244859457016, 0.2094520628452301));
	target1 += mul(tr1, float4x4(0.0108401505276561, 0.0073569868691266, 0.0448315776884556, 0.0920599550008774, -0.0208782758563757, -0.0072226687334478, 0.1590231209993362, 0.0974729061126709, -0.0504082255065441, -0.1492372304201126, -0.0240942239761353, -0.3387995064258575, -0.0284501910209656, -0.0475567393004894, -0.1351375281810760, -0.0968868359923363));
	target1 += mul(tr2, float4x4(-0.0175144840031862, 0.0215768050402403, -0.2070538252592087, -0.1020313948392868, -0.0632536634802818, 0.0187655575573444, 0.0033090459182858, 0.0483726076781750, 0.0874270573258400, 0.0391933582723141, -0.0733725428581238, 0.0455813333392143, 0.0519542098045349, -0.0167136136442423, 0.0001770213857526, -0.0226714108139277));
	target1 += mul(mr1, float4x4(0.0230981707572937, 0.0211336743086576, -0.0202524177730083, 0.0004777485737577, -0.3133100867271423, -0.2222708314657211, -0.3225338459014893, 0.0252504348754883, -0.1565012782812119, -0.1223759651184082, -0.1677924543619156, 0.1436173915863037, -0.1002913638949394, -0.4352810978889465, -0.1214068830013275, 0.1200122535228729));
	target1 += mul(mr2, float4x4(0.2746300697326660, 0.0240563396364450, 0.2214205712080002, -0.0140676703304052, -0.1697816103696823, 0.0239461977034807, -0.2184012532234192, -0.1122284159064293, -0.0025032388512045, -0.1982196122407913, -0.0088773546740413, -0.0592936985194683, 0.0981788560748100, 0.0590783730149269, 0.1699221283197403, 0.1146017014980316));
	target1 += mul(br1, float4x4(-0.1190557554364204, 0.0139884017407894, -0.3765408396720886, -0.1967576593160629, -0.0013050300767645, 0.0838785469532013, 0.0467342510819435, 0.0197970345616341, 0.0199079178273678, 0.1127095147967339, -0.0382974669337273, -0.0808331072330475, 0.0045804185792804, 0.1423084437847137, 0.0275978501886129, 0.0051016276702285));
	target1 += mul(br2, float4x4(0.0694821104407310, -0.1185832619667053, 0.1340767890214920, -0.0096760904416442, -0.0057105780579150, -0.0358094684779644, -0.0208928529173136, -0.0422658622264862, -0.1662766784429550, 0.0397685728967190, -0.0169682707637548, 0.1427496373653412, 0.1324639916419983, 0.0579542480409145, 0.1712465286254883, 0.1062873229384422));
	target1 = max(target1, 0) + float4(0.0476732961833477, -0.0824369415640831, 1.4746414422988892, 1.6789640188217163) * min(target1, 0);

	float4 target2 = float4(-0.0375947281718254, 0.2783663868904114, 0.0855874642729759, -0.0183580406010151);
	target2 += mul(tl1, float4x4(-0.3375159502029419, -0.0481248162686825, 0.0022695809602737, -0.1379150450229645, 0.2087368816137314, -0.1413425505161285, 0.0311671234667301, 0.2090687304735184, -0.1255441159009933, -0.3856352567672729, 0.0592494457960129, -0.2192105948925018, 0.0635740235447884, -0.0259831510484219, 0.1284605711698532, 0.1543060839176178));
	target2 += mul(tl2, float4x4(0.0265662875026464, 0.1603409945964813, -0.0106395082548261, 0.0252655427902937, 0.0633112043142319, 0.1634869277477264, 0.0606260225176811, -0.0386067330837250, 0.1025275588035583, -0.0086877709254622, 0.0572752207517624, 0.2958410382270813, 0.2315495908260345, -0.0511345490813255, -0.0684579163789749, 0.2366850525140762));
	target2 += mul(ml1, float4x4(-0.6637977361679077, 0.1115299314260483, 0.0334465689957142, -0.0595322623848915, 0.0194256473332644, 0.1154914125800133, -0.0093330284580588, -0.2107555270195007, 0.2593949139118195, -0.2310725152492523, -0.0191440880298615, 0.0831847414374352, 0.0869263112545013, 0.1271044909954071, -0.0199039578437805, 0.0421413294970989));
	target2 += mul(ml2, float4x4(0.1171221211552620, -0.2125719487667084, -0.0189515724778175, 0.2465390264987946, 0.1773879528045654, 0.2518055438995361, 0.0552976131439209, -0.1894477456808090, 0.1769066900014877, -0.1464872211217880, -0.0573948174715042, -0.4012156426906586, 0.2111275196075439, -0.5377770662307739, -0.2866773009300232, 0.1336809694766998));
	target2 += mul(bl1, float4x4(-0.6472494006156921, -0.0555078461766243, 0.0564644038677216, 0.0711399838328362, -0.0228650532662868, -0.0755083113908768, 0.0132119813933969, 0.1565485745668411, 0.0769101306796074, -0.4400988519191742, -0.0369989611208439, -0.0459617786109447, 0.1246264874935150, -0.2121030986309052, 0.0351070538163185, 0.1162980273365974));
	target2 += mul(bl2, float4x4(-0.0239488855004311, -0.4389697015285492, -0.0041466108523309, 0.2026203870773315, 0.0299914367496967, 0.0214463528245687, -0.0340079553425312, -0.0866646468639374, -0.1258078664541245, 0.0335666500031948, 0.0279387012124062, 0.0377361401915550, -0.0037173877935857, -0.1970001310110092, 0.0554011650383472, 0.0747631862759590));
	target2 += mul(tc1, float4x4(-0.5669959783554077, -0.0150139974430203, -0.0079386057332158, -0.1156958788633347, -0.0749717876315117, 0.1512815952301025, -0.0340143367648125, -0.1504366695880890, -0.1540268361568451, -0.0089722918346524, -0.0974140912294388, -0.4191842377185822, 0.0414282791316509, -0.0518460534512997, -0.1025082096457481, -0.1974052190780640));
	target2 += mul(tc2, float4x4(0.1328157931566238, -0.0447603911161423, 0.0625142455101013, 0.1125901266932487, 0.1033857688307762, 0.0811788439750671, 0.1270843595266342, -0.0564684942364693, 0.0797754079103470, 0.1379490494728088, -0.2863929569721222, -0.0602805763483047, 0.2699469923973083, 0.1226278319954872, -0.2505964636802673, 0.0639543756842613));
	target2 += mul(mc1, float4x4(-1.5813068151473999, 0.5872991085052490, -0.2429279834032059, -0.4303708970546722, 0.2854560911655426, 1.0167927742004395, 0.8617131114006042, 0.2191447615623474, 0.9627910852432251, 0.7867327332496643, 1.2628984451293945, 0.8908280134201050, -0.4586973786354065, -0.7981753349304199, 0.4780183732509613, -0.9264264106750488));
	target2 += mul(mc2, float4x4(0.2435170710086823, -0.0829131007194519, -0.3455205559730530, 0.4117922484874725, 0.2749316394329071, 0.1895177811384201, 0.4110289216041565, -0.1298204958438873, 0.1637304723262787, 0.8604004383087158, 1.0940867662429810, -0.3959148228168488, 0.3289682567119598, -0.0633709058165550, -2.0705056190490723, 0.1684481352567673));
	target2 += mul(bc1, float4x4(-0.8055392503738403, 0.2874773740768433, -0.1400482803583145, -0.1834644526243210, 0.0150187248364091, 0.0192099008709192, -0.0783268958330154, -0.2944276928901672, 0.0451190918684006, 0.1181604787707329, 0.1095703318715096, -0.2282790690660477, 0.1960140317678452, 0.3371279239654541, 0.0243086088448763, -0.0463834926486015));
	target2 += mul(bc2, float4x4(0.2196981906890869, -0.0534196794033051, -0.0839012116193771, 0.2049407809972763, 0.0194450635462999, -0.0593264624476433, 0.1640597432851791, 0.0274629276245832, -0.1243807971477509, 0.0611803941428661, -0.1799024045467377, -0.1864561140537262, 0.2465235143899918, -0.0211831126362085, -0.2282803803682327, -0.1430586874485016));
	target2 += mul(tr1, float4x4(-0.3611976802349091, 0.0288475938141346, -0.0297703798860312, -0.0418547466397285, -0.3251218497753143, -0.0134126413613558, -0.0686949566006660, -0.0233805924654007, -0.2749838531017303, -0.2486374378204346, 0.0724888965487480, 0.1193816959857941, -0.2721751034259796, -0.2033173292875290, 0.0248280912637711, 0.0589503161609173));
	target2 += mul(tr2, float4x4(0.1689156740903854, 0.0712056383490562, 0.1930764019489288, 0.0722641199827194, 0.0640723854303360, 0.0566449724137783, 0.0815568938851357, -0.0213705692440271, -0.1826065927743912, 0.0393006950616837, -0.1493768393993378, 0.0386883616447449, -0.0130320172756910, -0.0327960774302483, -0.0204591657966375, 0.0134796360507607));
	target2 += mul(mr1, float4x4(-0.5736998319625854, -0.0392777882516384, 0.1370634734630585, -0.0484432727098465, 0.1308025121688843, -0.2323654592037201, -0.2625242173671722, -0.2956316471099854, -0.1103305667638779, -0.0551420338451862, 0.0006514643318951, 0.0022458140738308, 0.2859890162944794, -0.0839410424232483, 0.5223253369331360, 0.0280438754707575));
	target2 += mul(mr2, float4x4(0.0556896403431892, 0.0735942423343658, -0.2387326955795288, 0.1338670998811722, 0.0996377170085907, 0.0365633517503738, 0.3044275343418121, -0.0164738632738590, 0.1139278411865234, 0.1249758303165436, 0.2395293861627579, -0.0708516016602516, 0.1228865459561348, 0.0634353235363960, -0.3463226258754730, -0.0362484715878963));
	target2 += mul(br1, float4x4(-0.4082182049751282, -0.1144043654203415, 0.0233679264783859, 0.0130491442978382, 0.0237790253013372, 0.0709472149610519, 0.1275831013917923, -0.0888639837503433, -0.0140889342874289, -0.1301848441362381, -0.1709596514701843, 0.0314525589346886, 0.0293366052210331, 0.0934117212891579, 0.0720594301819801, 0.0094668027013540));
	target2 += mul(br2, float4x4(0.1782542318105698, -0.1532294601202011, -0.2839424610137939, 0.0435897931456566, 0.0621095262467861, -0.0348550342023373, 0.0461588650941849, 0.0183234252035618, 0.2204841077327728, 0.1267120093107224, 0.1979495882987976, -0.2149147540330887, 0.2110942006111145, 0.0718472301959991, -0.1063910648226738, -0.0493422709405422));
	target2 = max(target2, 0) + float4(-0.0162308197468519, 0.4942881166934967, 0.1156802847981453, 1.4069133996963501) * min(target2, 0);

	tex3[gxy] = target1;
	tex4[gxy] = target2;
}


//!PASS 4
//!DESC mapping 3
//!IN tex3, tex4
//!OUT tex1, tex2
//!BLOCK_SIZE 8
//!NUM_THREADS 64

// Pass 4 ("mapping 3"): one 3x3 convolution step over the 8 feature channels
// stored in tex3 (4 channels) and tex4 (4 channels). Each thread produces one
// output texel: it gathers the 3x3 neighbourhood from both inputs, accumulates
// 18 vector-by-4x4-matrix products per output half, applies a per-channel
// leaky activation, and writes the two halves to tex1 and tex2.
//
// blockStart: origin added to the per-thread offset to obtain the texel
//             coordinate (presumably the top-left texel of this 8x8 block,
//             matching the BLOCK_SIZE directive - confirm against the runtime).
// threadId:   only .x is used; it is passed to Rmp8x8 (defined outside this
//             pass), which presumably maps the flat thread index to a 2D
//             offset inside the block - TODO confirm.
void Pass4(uint2 blockStart, uint3 threadId) {
	// Texel coordinate handled by this thread.
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	// Threads that fall outside the GetInputSize() bounds produce no output.
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	// NOTE(review): inputPt is assumed to be the size of one texel in
	// normalized coordinates (1 / input size); under that assumption pos is the
	// centre of texel gxy - confirm against the definition of GetInputPt().
	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// Naming of the 3x3 neighbourhood (suffix 1 = tex3, suffix 2 = tex4):
	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	// `sam` is the POINT-filtered sampler, so each fetch returns a single texel
	// without interpolation.
	float4 tl1 = tex3.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = tex3.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = tex3.SampleLevel(sam, pos, 0);
	float4 bc1 = tex3.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = tex3.SampleLevel(sam, pos + inputPt, 0);

	// Same nine taps from the second input texture.
	float4 tl2 = tex4.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = tex4.SampleLevel(sam, pos, 0);
	float4 bc2 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = tex4.SampleLevel(sam, pos + inputPt, 0);

	// First four output channels. The accumulator starts from a constant
	// (presumably the layer bias) and adds one row-vector * 4x4-matrix product
	// per tap and per input texture. The literals are trained weights; keep both
	// the values and the accumulation order unchanged so results stay bit-stable.
	float4 target1 = float4(0.1563357263803482, -0.0187121629714966, -0.0126413907855749, 0.1197946891188622);
	target1 += mul(tl1, float4x4(0.0248779058456421, 0.0970748737454414, -0.0710923895239830, -0.0502988137304783, -0.1513994187116623, 0.2118151038885117, -0.1060286536812782, -0.0017144688172266, -0.0556356199085712, -0.0478693507611752, 0.1046469956636429, 0.0019442490302026, -0.1099060326814651, 0.0196554642170668, -0.0393092185258865, -0.0131395589560270));
	target1 += mul(tl2, float4x4(0.4478245079517365, -0.0910880789160728, -0.2786923944950104, -0.0162817053496838, 0.0112394141033292, -0.1194593086838722, 0.0425493344664574, -0.0044937161728740, -0.1628813296556473, -0.2678257226943970, 0.2309084981679916, 0.0530976355075836, -0.1750707328319550, -0.0064609451219440, -0.0241944380104542, 0.0247293189167976));
	target1 += mul(ml1, float4x4(-0.0999718829989433, 0.2529200315475464, -0.2410994917154312, -0.0778252258896828, -0.1284751892089844, 0.0736103430390358, -0.0542359724640846, -0.0292810052633286, 0.0842959657311440, 0.1324738860130310, -0.1200775727629662, 0.0923810601234436, -0.3018241524696350, -0.0444923602044582, -0.2015913128852844, -0.0204973872750998));
	target1 += mul(ml2, float4x4(-0.3240330517292023, -0.5515946745872498, -0.0859212949872017, 0.0462087914347649, -0.0053406502120197, -0.0506816878914833, 0.1402591317892075, 0.0343847945332527, -0.1610218435525894, 0.7799831628799438, -0.4482840299606323, 0.4723005294799805, -0.2119023054838181, 0.1128389388322830, -0.1832690685987473, -0.0362148694694042));
	target1 += mul(bl1, float4x4(-0.0930702313780785, -0.0642902702093124, -0.0367766097187996, -0.0020038220100105, 0.0453414916992188, 0.0075245667248964, -0.0119760576635599, 0.0035134663339704, 0.1123304143548012, 0.1128236204385757, 0.0063381437212229, -0.0171416625380516, -0.1588087528944016, -0.2571538090705872, 0.2774460315704346, 0.0171892605721951));
	target1 += mul(bl2, float4x4(-0.0579603239893913, 0.3495539724826813, -0.4717904627323151, -0.0042169657535851, -0.0763773098587990, -0.1721021682024002, 0.1467801630496979, -0.0172323398292065, -0.0802723765373230, -0.0116639221087098, 0.0178409330546856, -0.0121179902926087, -0.0710472315549850, -0.1480658054351807, 0.1912731230258942, 0.0008336952887475));
	target1 += mul(tc1, float4x4(0.0445428267121315, -0.1481070518493652, 0.1304957717657089, -0.1157935336232185, -0.1166803613305092, -0.3920064568519592, 0.8788923621177673, 0.3384790122509003, -0.0278418567031622, -0.1011456921696663, -0.0513025075197220, -0.0748791247606277, -0.0502855703234673, -0.1696091145277023, -0.0049756630323827, -0.0209914986044168));
	target1 += mul(tc2, float4x4(-0.1725586950778961, 0.1313660293817520, -0.3432282805442810, -0.0246748421341181, -0.0593904331326485, 0.1949618458747864, -0.2589366734027863, 0.0379127524793148, -0.0550928115844727, -0.0913493037223816, 0.1150950565934181, -0.1235521435737610, -0.0625528916716576, -0.3131158649921417, 0.4109992682933807, 0.0410865694284439));
	target1 += mul(mc1, float4x4(0.8893174529075623, -0.0700975209474564, 0.7708584070205688, -0.2465052455663681, -0.1121069490909576, -0.5598245263099670, 0.7997139692306519, -0.5694547295570374, -0.1351616084575653, 0.1108073145151138, 0.7269443273544312, 0.1369582563638687, 0.6071134805679321, 0.8817817568778992, 0.0194139964878559, -0.2244683355093002));
	target1 += mul(mc2, float4x4(-0.2949840426445007, -0.3180212676525116, -0.5362266302108765, 0.0656562000513077, -0.6400785446166992, -0.4476518630981445, -0.6344851851463318, 0.7187259793281555, -0.3846258223056793, 1.0990517139434814, -0.7282652258872986, -0.6530264616012573, 0.8294114470481873, 0.6079595088958740, 0.3271140158176422, 0.4062923491001129));
	target1 += mul(bc1, float4x4(-0.0373790934681892, -0.1651912927627563, 0.0589407421648502, 0.0622759014368057, 0.0095487469807267, 0.0824478641152382, -0.0216544214636087, 0.1070290282368660, -0.0805450007319450, -0.0367405600845814, 0.0055392896756530, 0.0046677836216986, 0.1806629896163940, 0.2809534966945648, 0.0341635458171368, 0.1274557113647461));
	target1 += mul(bc2, float4x4(0.3259792327880859, -0.3150677680969238, -0.2272015213966370, 0.0287732314318419, 0.0530966222286224, 0.3310768604278564, -0.2079527378082275, -0.1340134441852570, 0.0769909769296646, -0.2229669988155365, 0.1012685745954514, 0.0622584670782089, 0.1539722383022308, 0.2163516432046890, -0.1021269038319588, -0.0561319366097450));
	target1 += mul(tr1, float4x4(-0.1077229678630829, -0.2074016332626343, 0.0913541764020920, 0.0391069389879704, 0.0848263725638390, -0.0416730083525181, 0.0603712275624275, 0.0457836911082268, 0.0035252417437732, 0.0004963557003066, 0.0027605029754341, 0.0254582706838846, -0.0146415829658508, 0.0273043140769005, 0.0692857503890991, 0.0091926595196128));
	target1 += mul(tr2, float4x4(0.0692942291498184, -0.4098799824714661, 0.3745719194412231, -0.0331038050353527, -0.0513759665191174, 0.0989063531160355, -0.1431623697280884, -0.0274865441024303, 0.0244991369545460, -0.0112041812390089, 0.0523535087704659, 0.0222812052816153, -0.0314176008105278, 0.2347036451101303, -0.0928338095545769, -0.0338262394070625));
	target1 += mul(mr1, float4x4(0.3805117309093475, -0.1917886883020401, 0.2292910665273666, 0.3065188527107239, -0.2231798321008682, 0.2646720707416534, -0.1371945887804031, -0.0272636637091637, 0.1435333937406540, -0.0137438504025340, 0.0088603384792805, -0.0633594989776611, -0.1662645787000656, 0.2498313635587692, -0.2899549007415771, 0.0460192002356052));
	target1 += mul(mr2, float4x4(0.1833423078060150, 0.0624732412397861, -0.3103306889533997, -0.0102488445118070, 0.0073305973783135, -0.2617286443710327, 0.2580088973045349, -0.0416168905794621, 0.1506632268428802, -0.0574487410485744, 0.0778761878609657, 0.1702914088964462, -0.0307608898729086, 0.0848424360156059, -0.1303439885377884, -0.0837477520108223));
	target1 += mul(br1, float4x4(0.0605936460196972, -0.0835580825805664, 0.0067690783180296, 0.0539834238588810, 0.0881687626242638, -0.0001589829771547, -0.0706917122006416, 0.0060382266528904, 0.1218314692378044, 0.0132934488356113, 0.0503435060381889, -0.0386124141514301, -0.1492055207490921, -0.0103553524240851, -0.0697906538844109, -0.0208332743495703));
	target1 += mul(br2, float4x4(-0.2907077968120575, -0.1428615152835846, -0.1178332567214966, 0.0093302968889475, -0.0501379445195198, 0.1940260678529739, -0.0139665808528662, 0.0440400391817093, 0.0546711236238480, -0.0606320053339005, 0.0891899466514587, -0.0187927689403296, -0.0581561885774136, -0.0785671249032021, -0.0746953785419464, -0.0350385755300522));
	// Activation: positive components pass through unchanged, negative
	// components are scaled by a per-channel constant (the PReLU form
	// max(x, 0) + a * min(x, 0)).
	target1 = max(target1, 0) + float4(0.0636819079518318, -0.0394099690020084, 0.0154740391299129, 1.4728027582168579) * min(target1, 0);

	// Remaining four output channels: same taps, a different set of weights.
	float4 target2 = float4(0.0120743932202458, -0.0392544493079185, 0.0073779639787972, 0.0674902275204659);
	target2 += mul(tl1, float4x4(-0.0253207311034203, -0.0178817976266146, -0.0941111445426941, -0.0096205184236169, -0.0948953703045845, -0.1085971817374229, -0.1137845888733864, -0.1022860705852509, 0.0362259782850742, 0.0741802081465721, -0.0426849052309990, 0.1004608497023582, -0.0553506910800934, -0.0631089508533478, 0.0144856451079249, -0.0129664530977607));
	target2 += mul(tl2, float4x4(0.1779767572879791, 0.0777176544070244, 0.3302779793739319, -0.0630711168050766, 0.0130759663879871, -0.0583435148000717, 0.0534219592809677, -0.0205510091036558, -0.1502479761838913, 0.0436260215938091, -0.3180699944496155, 0.1497740298509598, -0.0714024156332016, -0.0304171387106180, -0.1271478682756424, -0.0160594787448645));
	target2 += mul(ml1, float4x4(-0.2060592919588089, 0.0177838709205389, 0.2679423391819000, 0.0484818480908871, -0.1020416766405106, -0.0875749215483665, 0.2993223369121552, 0.0260893367230892, -0.0320936217904091, -0.0193585660308599, 0.1074631884694099, -0.0031519578769803, -0.1419622153043747, -0.0621272362768650, -0.2517412602901459, -0.1112222597002983));
	target2 += mul(ml2, float4x4(0.0975706353783607, -0.1846135258674622, -0.2201799452304840, -0.0123737258836627, 0.0554487742483616, -0.0255174264311790, -0.2444359511137009, -0.1069484427571297, -0.0487980805337429, -0.0570272766053677, 0.1149747893214226, -0.0176141038537025, -0.1059966161847115, 0.1263166964054108, 0.1091895326972008, 0.0400139950215816));
	target2 += mul(bl1, float4x4(-0.0971131697297096, 0.1365687996149063, -0.1780374944210052, 0.2879253029823303, -0.0652871504426003, -0.0537611208856106, -0.0763697773218155, 0.0455291420221329, 0.0246813204139471, -0.0074042826890945, 0.2309278100728989, 0.0046464367769659, -0.0692639946937561, 0.0042336005717516, -0.2525716722011566, 0.3263924717903137));
	target2 += mul(bl2, float4x4(-0.0798230320215225, -0.1135407239198685, -0.4427868127822876, 0.0395730547606945, 0.0537165030837059, 0.0225568320602179, -0.1189213171601295, -0.0707803219556808, -0.0074193109758198, -0.0493272021412849, 0.1401828378438950, 0.1580671072006226, -0.0574450828135014, 0.0058684512041509, -0.1626979410648346, 0.0857749953866005));
	target2 += mul(tc1, float4x4(-0.0715018808841705, 0.0310761369764805, -0.3861580789089203, 0.0770959705114365, 0.1908793896436691, 0.2067244797945023, 0.1176377162337303, 0.0705406218767166, -0.0944501385092735, 0.1110353469848633, -0.2772715091705322, -0.0079436022788286, 0.1045550853013992, 0.0076957782730460, 0.0220303647220135, 0.0434708297252655));
	target2 += mul(tc2, float4x4(-0.4399432241916656, -0.0885980203747749, 0.2042984664440155, 0.0499991811811924, 0.0443918742239475, -0.0322260186076164, 0.0960535407066345, 0.0173596814274788, 0.0851852819323540, -0.0549903102219105, -0.2807548046112061, -0.1112457811832428, 0.0906120762228966, 0.1066406965255737, -0.3857226073741913, 0.1345559209585190));
	target2 += mul(mc1, float4x4(0.2723454833030701, -0.1252564936876297, 0.3694194555282593, 0.0895726680755615, 0.2200681418180466, 0.3019879162311554, 0.4471587240695953, 0.2883224189281464, 0.0264542233198881, 0.3020884990692139, 0.2432236075401306, 0.5683830380439758, -0.0914180725812912, -0.1473430246114731, -0.5914288163185120, -0.1922498643398285));
	target2 += mul(mc2, float4x4(0.2161763310432434, -0.0415927544236183, -0.0378856658935547, -0.0317508913576603, -0.2287719398736954, -0.4885228574275970, -0.2818722724914551, -0.3797133862972260, 0.4456195533275604, 0.7929218411445618, -0.1307591795921326, 0.2016224861145020, 0.2801168859004974, -0.0006753758061677, 0.5686879754066467, 0.0415142513811588));
	target2 += mul(bc1, float4x4(0.0530648417770863, -0.2444190829992294, 0.0235249921679497, 0.0224611610174179, 0.0651976913213730, -0.0449720136821270, 0.1208736971020699, -0.0743656828999519, 0.1318923383951187, 0.1823218315839767, 0.5197241306304932, 0.1862808614969254, 0.2317387014627457, -0.2857755720615387, 0.1650039553642273, -0.1755792349576950));
	target2 += mul(bc2, float4x4(-0.0835669562220573, 0.0129750147461891, -0.4473843872547150, -0.5028023719787598, -0.0481940247118473, -0.0905050709843636, -0.6921447515487671, -0.2693449556827545, 0.2342379540205002, 0.0392520241439342, 0.4797120690345764, 0.1215118318796158, 0.1369755119085312, -0.1010836884379387, -0.0070533878169954, -0.2589581906795502));
	target2 += mul(tr1, float4x4(0.1088275387883186, 0.0839678123593330, -0.3048903048038483, -0.0084876483306289, 0.3669581115245819, 0.0472131110727787, -0.1243446245789528, -0.1012610718607903, 0.1622449755668640, -0.1317851245403290, -0.0711368247866631, -0.1593778431415558, -0.0104977218434215, -0.0608197152614594, 0.0286014154553413, 0.0388568006455898));
	target2 += mul(tr2, float4x4(0.0885753333568573, 0.1340429484844208, -0.0027331225574017, -0.0736069232225418, -0.1520483642816544, 0.1104429140686989, 0.1728315353393555, 0.1210049912333488, 0.0688045620918274, -0.1218316256999969, -0.0629790797829628, -0.1348981261253357, 0.0943875387310982, -0.0730865821242332, -0.2342475503683090, -0.0808216184377670));
	target2 += mul(mr1, float4x4(-0.1263358592987061, -0.7069915533065796, 0.1411920040845871, -0.2682386934757233, 0.2957956194877625, 0.1127238497138023, 0.3032427430152893, 0.2759581208229065, 0.8320354819297791, -0.0136295817792416, 0.0530097521841526, -0.0727380812168121, 0.0614950619637966, 0.0339637212455273, -0.0386842861771584, -0.0550391897559166));
	target2 += mul(mr2, float4x4(-0.0998953506350517, -0.2231116443872452, 0.0948988571763039, 0.1258799731731415, -0.6855500936508179, -0.4546283185482025, -0.3335786461830139, 0.0718025788664818, 0.6456025242805481, -0.2023779749870300, 0.1325027197599411, -0.1078727394342422, 0.3024467229843140, 0.1703380942344666, 0.2321108430624008, 0.2143797874450684));
	target2 += mul(br1, float4x4(0.0542521663010120, -0.2265717238187790, -0.0289179943501949, 0.0697252005338669, -0.1518151611089706, 0.0225123148411512, 0.0370684377849102, -0.1546901017427444, 0.0753403753042221, -0.0465561784803867, 0.1635994315147400, 0.1127668544650078, 0.0738654434680939, 0.1077028661966324, -0.1282461881637573, -0.0510208979249001));
	target2 += mul(br2, float4x4(-0.1740311384201050, 0.0542572811245918, 0.0551791004836559, 0.1728909015655518, -0.0078740902245045, 0.0999085083603859, -0.0136023676022887, 0.0501077920198441, 0.0529310964047909, -0.0859082415699959, -0.0285708475857973, -0.0186515673995018, -0.0793913751840591, 0.0688859447836876, -0.1684362143278122, 0.0473327860236168));
	// Same activation form as above, with this half's per-channel slopes.
	target2 = max(target2, 0) + float4(-0.1339675635099411, 0.3599768280982971, -0.1313954293727875, 0.8648772835731506) * min(target2, 0);

	// Store both halves of the result at this thread's texel.
	tex1[gxy] = target1;
	tex2[gxy] = target2;
}


//!PASS 5
//!DESC mapping 4, sub-band residuals
//!IN tex1, tex2, featureMap1, featureMap2
//!OUT tex3, tex4
//!BLOCK_SIZE 8
//!NUM_THREADS 64

void Pass5(uint2 blockStart, uint3 threadId) {
	uint2 gxy = Rmp8x8(threadId.x) + blockStart;
	uint2 inputSize = GetInputSize();
	if (gxy.x >= inputSize.x || gxy.y >= inputSize.y) {
		return;
	}

	float2 inputPt = GetInputPt();
	float2 pos = (gxy + 0.5f) * inputPt;

	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	float4 tl1 = tex1.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = tex1.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = tex1.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = tex1.SampleLevel(sam, pos, 0);
	float4 bc1 = tex1.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = tex1.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = tex1.SampleLevel(sam, pos + inputPt, 0);

	float4 tl2 = tex2.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = tex2.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = tex2.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = tex2.SampleLevel(sam, pos, 0);
	float4 bc2 = tex2.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = tex2.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = tex2.SampleLevel(sam, pos + inputPt, 0);

	float4 c1 = { -0.0466146245598793,0.0514914207160473,-0.1174036413431168,0.1775186359882355 };
	c1 += mul(tl1, float4x4(0.0403013713657856, -0.0201274380087852, -0.0814156234264374, -0.0156530365347862, -0.1377502679824829, -0.0417905971407890, 0.0382207930088043, -0.1200272291898727, 0.0445257574319839, -0.0034040021710098, -0.1461200863122940, -0.0280020851641893, -0.0284179765731096, -0.0257030930370092, -0.0306571796536446, 0.0027001383714378));
	c1 += mul(tl2, float4x4(0.0474935993552208, -0.1903701126575470, -0.5459222793579102, 0.0117317456752062, 0.1298255324363708, -0.1852943897247314, 0.0912317335605621, 0.0597700774669647, 0.0291008763015270, -0.0133294332772493, 0.2249753177165985, -0.1039957106113434, -0.0360252261161804, 0.0713268965482712, -0.1684121936559677, 0.0038015090394765));
	c1 += mul(ml1, float4x4(0.1707874685525894, -0.0190297029912472, 0.0620337612926960, 0.1826018989086151, -0.0537132881581783, 0.0105489455163479, 0.0954312533140182, -0.0787296965718269, 0.1069839373230934, 0.1045138239860535, 0.1288910657167435, 0.0561389364302158, -0.0446831695735455, 0.0814650580286980, 0.0968160405755043, -0.0053927232511342));
	c1 += mul(ml2, float4x4(-0.4036456644535065, 0.5173328518867493, 0.2669098377227783, 0.1405677497386932, 0.7028214335441589, 0.4912839531898499, 0.6029286384582520, 0.5650771260261536, 0.0271350406110287, 0.0700132623314857, -0.0723611563444138, -0.1151952818036079, -0.1905510127544403, -0.1715065985918045, 0.2840125858783722, -0.0591109022498131));
	c1 += mul(bl1, float4x4(-0.0454136617481709, 0.0026271974202245, 0.0717174336314201, -0.0211183167994022, -0.0145919620990753, -0.0671255514025688, -0.0502023696899414, 0.0087185343727469, -0.0288616847246885, -0.0113478675484657, -0.2463355064392090, 0.0117223775014281, -0.0308947954326868, -0.0319634601473808, 0.0732206851243973, -0.0482302159070969));
	c1 += mul(bl2, float4x4(0.1823509633541107, -0.0050594508647919, -0.3849277794361115, 0.0906973257660866, -0.2269264608621597, -0.1660439521074295, -0.0572436749935150, -0.2200210094451904, -0.0326488390564919, -0.0223822314292192, 0.2504203319549561, -0.0020041887182742, 0.0842133983969688, 0.0320363529026508, -0.2134571075439453, 0.0419851355254650));
	c1 += mul(tc1, float4x4(-0.1438133865594864, 0.0348416790366173, 0.0934710800647736, 0.0492126196622849, -0.2420855760574341, 0.0501662716269493, 0.1352647244930267, -0.0790266394615173, 0.2533731162548065, 0.0078659672290087, -0.6405819058418274, 0.0266417451202869, 0.1385341882705688, 0.2193202823400497, -0.0242960918694735, 0.0509849824011326));
	c1 += mul(tc2, float4x4(-0.1021846532821655, 0.0409118719398975, -0.0411306917667389, -0.0381408408284187, 0.3855873942375183, 0.0598353408277035, -0.0579320192337036, 0.0582632422447205, 0.1651727408170700, 0.1139431521296501, 0.3897903859615326, -0.2790665924549103, -0.2033874839544296, 0.0795733034610748, 0.1354800611734390, 0.0271498821675777));
	c1 += mul(mc1, float4x4(0.5974506139755249, 0.2842084765434265, 0.2237064242362976, -0.1201776340603828, 0.4645690321922302, -0.4258180558681488, 0.2686293423175812, -0.5262981653213501, -0.6491079330444336, -0.1924646943807602, 0.5328685045242310, -0.2459655404090881, -0.0878667980432510, -0.5910828113555908, 0.5153566598892212, 0.1743167340755463));
	c1 += mul(mc2, float4x4(-0.0390859283506870, 0.1246269494295120, 0.0820790305733681, -0.0892064496874809, 0.2791964113712311, -0.0395625308156013, 0.8821132779121399, 0.0838626548647881, 0.0933722704648972, -0.3405517935752869, -0.6993819475173950, -0.2844510078430176, 0.0422397889196873, -0.0712213888764381, 0.4871867001056671, 0.1554806381464005));
	c1 += mul(bc1, float4x4(-0.2699566185474396, 0.0006965834181756, 0.2727784812450409, -0.0620054267346859, -0.3147658407688141, -0.0038526458665729, 0.3236006200313568, -0.0877058431506157, 0.2106281071901321, 0.0453009121119976, -0.5439859628677368, 0.0184146761894226, -0.0296773612499237, -0.0038715677801520, 0.1077574864029884, 0.0138774076476693));
	c1 += mul(bc2, float4x4(-0.0886235535144806, -0.0839715227484703, -0.1154380440711975, -0.0073053352534771, 0.0023921213578433, -0.0275820419192314, 0.0455723628401756, 0.0129091050475836, -0.0887900739908218, -0.0148513885214925, 0.0876928195357323, -0.0784894824028015, -0.0062762177549303, -0.0526834838092327, 0.0037576633039862, 0.0319633670151234));
	c1 += mul(tr1, float4x4(-0.0464110672473907, 0.0236346330493689, 0.0669583231210709, -0.0472575165331364, 0.0208626259118319, -0.0271043106913567, -0.0606599785387516, -0.0139442197978497, -0.0014815550530329, 0.0122182741761208, 0.0004507199628279, 0.0158279109746218, 0.0385761559009552, 0.0296174921095371, -0.0291853323578835, 0.0165338683873415));
	c1 += mul(tr2, float4x4(-0.0233153514564037, 0.0219305511564016, -0.0743728205561638, -0.0093567697331309, 0.0286262184381485, 0.0774424001574516, 0.1148394867777824, -0.0301702339202166, 0.0024869549088180, -0.0374004244804382, 0.4192572534084320, -0.0790813118219376, -0.0493366643786430, -0.0104152699932456, -0.0490021072328091, -0.0024766430724412));
	c1 += mul(mr1, float4x4(0.0144469039514661, -0.1078102141618729, 0.1059413179755211, -0.1089596152305603, 0.0470409952104092, 0.0500787831842899, -0.0514846295118332, 0.0626327991485596, -0.0882258489727974, 0.0321751609444618, 0.2398201376199722, -0.2443147897720337, 0.0259658545255661, -0.0396602302789688, 0.2174025326967239, 0.0715249925851822));
	c1 += mul(mr2, float4x4(-0.0154824554920197, 0.0291374288499355, 0.0039887567982078, -0.0431501194834709, 0.0593536123633385, -0.0845754146575928, 0.2000104188919067, 0.0186260938644409, 0.0547599084675312, 0.0526885949075222, -0.1233010515570641, -0.0432526804506779, 0.0357321202754974, 0.0021650493144989, 0.0814491733908653, 0.0971980616450310));
	c1 += mul(br1, float4x4(0.0287933535873890, 0.0494442507624626, 0.0210838094353676, -0.0483787320554256, -0.0759167149662971, -0.0253081526607275, 0.1219362914562225, -0.0418672412633896, -0.0189503412693739, 0.0143473483622074, -0.1790502667427063, -0.0662427768111229, 0.0324281528592110, 0.0148597611114383, 0.0266743116080761, 0.0377049185335636));
	c1 += mul(br2, float4x4(-0.0054656565189362, -0.0150196319445968, -0.0463149808347225, -0.0104772448539734, 0.0347928367555141, 0.0091987038031220, 0.0037015024572611, 0.0577751062810421, 0.0232732165604830, 0.0034828644711524, 0.0977631732821465, -0.0051266341470182, -0.0189268663525581, -0.0049852686934173, 0.0438088737428188, -0.0025434335693717));
	c1 = max(c1, 0) + float4(0.2930726408958435, -0.7832366824150085, 0.0082256151363254, 1.0583437681198120) * min(c1, 0);

	float4 c2 = { -0.0836324766278267,0.0299216359853745,0.0159619841724634,-0.1379968672990799 };
	c2 += mul(tl1, float4x4(-0.0837135538458824, 0.0536015741527081, -0.0739900916814804, 0.0180259179323912, 0.0306078922003508, 0.0638481751084328, -0.0674207285046577, -0.0308991391211748, -0.0134472101926804, -0.0455930270254612, 0.0009395828237757, -0.0776428431272507, -0.0028933393768966, -0.0612038075923920, -0.0162172410637140, -0.0327735245227814));
	c2 += mul(tl2, float4x4(-0.1481152623891830, 0.1160185635089874, -0.1463897079229355, 0.0038319902960211, -0.1815536171197891, 0.0099428771063685, -0.1647379845380783, 0.0335076004266739, -0.0302220620214939, -0.1180571690201759, -0.0232424903661013, -0.0164348836988211, -0.0015188503311947, 0.1787684559822083, 0.0413909815251827, 0.0699580833315849));
	c2 += mul(ml1, float4x4(0.0381213910877705, -1.8747351169586182, 0.0916024073958397, -0.1057635396718979, -0.1335459649562836, 0.0378836020827293, -0.0848037749528885, 0.1720509082078934, 0.1115766093134880, 0.0519676357507706, 0.1311796754598618, -0.2338305413722992, -0.0886595770716667, 0.1390771120786667, 0.0120587171986699, 0.0929709225893021));
	c2 += mul(ml2, float4x4(-0.0403309725224972, 0.1023108437657356, -0.0249778237193823, -0.2056589871644974, 0.0864044427871704, 0.7677633166313171, 0.6112527251243591, -0.0935023576021194, 0.0358289442956448, -0.0510838404297829, 0.0531301461160183, -0.1200713515281677, -0.0281702410429716, 0.3054289221763611, -0.1970508396625519, 0.1440129280090332));
	c2 += mul(bl1, float4x4(0.0137230604887009, 0.1833357512950897, -0.0056075016036630, -0.1050542071461678, 0.0367035493254662, 0.0896537080407143, -0.0109558179974556, -0.0221142154186964, -0.0462382063269615, -0.1151964291930199, -0.0042086942121387, 0.0297981910407543, 0.0043998458422720, 0.0687817037105560, -0.0601253211498260, 0.0031949516851455));
	c2 += mul(bl2, float4x4(0.0760864540934563, 0.1863034367561340, 0.0503818355500698, -0.0258647575974464, -0.0760487392544746, 0.2433954179286957, -0.1973436474800110, 0.0079258847981691, -0.0294476337730885, -0.0404389686882496, -0.0294238775968552, 0.0358795709908009, 0.0298653114587069, 0.0783578902482986, 0.0419599078595638, 0.0248970054090023));
	c2 += mul(tc1, float4x4(0.0907182395458221, 0.0576495565474033, 0.0530257523059845, 0.0549531430006027, 0.0915074944496155, -0.0465312339365482, -0.0230909585952759, -0.1178105399012566, -0.1995413154363632, -0.0433083362877369, -0.0418573357164860, 0.0866744294762611, -0.0793146342039108, 0.0074148247949779, 0.1731810569763184, 0.0715740397572517));
	c2 += mul(tc2, float4x4(0.0023458630312234, 0.0404974594712257, -0.1356777399778366, 0.0418198816478252, -0.0675975754857063, 0.0050714882090688, 0.1104314029216766, -0.1206769123673439, -0.1913090348243713, 0.0773992761969566, 0.0560133233666420, -0.2490582764148712, 0.0971352458000183, 0.0728188008069992, 0.0192280132323503, 0.0784228071570396));
	c2 += mul(mc1, float4x4(-0.0261659007519484, 0.1157309040427208, -0.3534074723720551, 0.5200188755989075, 0.1345363408327103, -0.1973183751106262, 0.1199645772576332, -1.1136766672134399, 0.1412540972232819, 0.1534357517957687, 0.2593606412410736, 0.3824510574340820, 0.5013928413391113, -0.1928857117891312, -0.2875523269176483, -0.0354673676192760));
	c2 += mul(mc2, float4x4(0.1194906458258629, 0.1256935596466064, 0.0932049900293350, 0.1464174836874008, 0.0670514181256294, -0.1400509625673294, 0.1003381684422493, -0.1169824004173279, 0.2452844530344009, -0.0348181650042534, -0.3607256710529327, -0.3338264226913452, -0.0456272326409817, -0.0939910858869553, -0.1304696053266525, 0.0402086712419987));
	c2 += mul(bc1, float4x4(-0.0869473740458488, -0.1211445480585098, 0.0065225088037550, 0.0068075512535870, -0.1304764598608017, -0.0536689385771751, -0.1490984708070755, -0.0136555638164282, 0.1024399474263191, 0.0315260104835033, 0.0369606800377369, -0.0167442485690117, 0.0501606240868568, -0.0476666353642941, 0.0291139576584101, -0.0997947081923485));
	c2 += mul(bc2, float4x4(-0.0373449698090553, -0.0382255539298058, -0.1446493864059448, 0.0054460307583213, 0.0428361445665359, -0.2411493360996246, 0.0281034875661135, -0.1959404051303864, -0.0147660700604320, -0.0115446811541915, -0.0435077212750912, -0.0445765219628811, 0.0025869212113321, -0.0454641655087471, -0.0552970357239246, 0.0636689588427544));
	c2 += mul(tr1, float4x4(0.0051982915028930, -0.0825250744819641, 0.0167135465890169, -0.1018612906336784, -0.0686964690685272, 0.0087886471301317, -0.0206265803426504, -0.0206072553992271, 0.0498707666993141, -0.0364030301570892, 0.0309015773236752, 0.0090340757742524, -0.0129618886858225, 0.0051486417651176, 0.0260841641575098, 0.0167939160019159));
	c2 += mul(tr2, float4x4(0.0010780893499032, -0.0352349840104580, -0.0192162413150072, -0.0381371527910233, 0.0282760411500931, -0.0559629201889038, 0.0619673281908035, -0.0514238551259041, 0.0122259482741356, 0.0608348101377487, -0.0754647627472878, -0.1400517821311951, -0.0136043848469853, 0.0550616234540939, -0.0093750739470124, 0.0393888689577579));
	c2 += mul(mr1, float4x4(-0.1028572022914886, -0.0257119275629520, -0.0871436968445778, -0.2802977561950684, 0.1279940754175186, 0.0694741085171700, 0.0366430617868900, 0.1782210469245911, -0.0931140556931496, -0.1249292492866516, -0.0775476619601250, -0.3643486201763153, -0.1307956129312515, -0.0392269045114517, -0.0212084632366896, 0.0024634231813252));
	c2 += mul(mr2, float4x4(-0.0036887160968035, -0.0489760562777519, -0.0022769547067583, -0.0936355590820312, -0.0981694832444191, -0.0402673967182636, -0.0764046013355255, 0.0067043504677713, 0.0426195561885834, -0.0056512621231377, 0.0814872384071350, -0.0765113532543182, 0.0027868365868926, 0.0623648613691330, 0.0598746836185455, 0.1226531565189362));
	c2 += mul(br1, float4x4(0.0350537523627281, 0.0229270569980145, 0.0465267412364483, -0.0452729463577271, 0.0256041418761015, 0.0215708781033754, -0.0193957649171352, -0.0173931997269392, 0.0231500957161188, -0.0499401167035103, 0.0173991154879332, -0.0804103761911392, -0.0232445765286684, -0.0107213268056512, 0.0450597628951073, 0.0691299363970757));
	c2 += mul(br2, float4x4(0.0023379696067423, -0.0239015202969313, -0.0137971211224794, -0.0529763884842396, 0.0394022278487682, 0.0184435173869133, 0.0855478867888451, 0.0777183994650841, -0.0052624838426709, -0.0031146518886089, 0.0387042500078678, 0.0015482418239117, -0.0088084554299712, 0.0233839545398951, -0.0151950558647513, 0.0070519605651498));
	c2 = max(c2, 0) + float4(0.7931001186370850, -0.0506631620228291, 0.3833878636360168, 0.3061273992061615) * min(c2, 0);

	float4 target1 = float4(0.0313877351582050, -0.0325053185224533, -0.0413495972752571, 0.0357267409563065);
	target1 += mul(c1, float4x4(0.1461677402257919, -0.1208343803882599, 0.0125428512692451, -0.1319324076175690, -0.3387282788753510, -0.3393035829067230, -0.2043240815401077, 0.0048940703272820, -0.0343158058822155, 0.0995195582509041, -0.0571033284068108, -0.0356303341686726, 0.1097832918167114, 0.1462953090667725, 0.0575199872255325, -0.1682354062795639));
	target1 += mul(c2, float4x4(0.1110563054680824, 0.0068465564399958, -0.0098551185801625, -0.1380221396684647, -0.3268660008907318, -0.2625139057636261, 0.5479852557182312, -0.0188483651727438, -0.7557058334350586, -0.6716431379318237, 0.0680231377482414, 1.0786534547805786, -0.0519768036901951, -0.0483648441731930, 0.1052823588252068, 0.0570317767560482));
	target1 += featureMap1.SampleLevel(sam, pos, 0);
	target1 = max(target1, 0) + float4(0.9962985515594482, 0.9851159453392029, 0.2272046357393265, -0.1116774082183838) * min(target1, 0);

	float4 target2 = float4(-0.0820835754275322, -0.0049459170550108, -0.1635029017925262, 0.0367167443037033);
	target2 += mul(c1, float4x4(-0.1730685681104660, -0.1758024245500565, -0.0673282966017723, -0.7883995175361633, -0.0092403469607234, 0.1237237676978111, -0.0802010595798492, 0.1532886922359467, -0.0256615914404392, 0.0783618539571762, -0.0216845069080591, 0.4360575079917908, 0.2488089799880981, -0.0421040952205658, 0.4070311486721039, 0.1112201139330864));
	target2 += mul(c2, float4x4(-0.1360913068056107, -0.0307455379515886, -0.1545475125312805, -0.0465389303863049, -0.1308580189943314, 0.2625028789043427, -0.2567890584468842, 0.3363034725189209, 0.1148972064256668, 0.3114618360996246, -0.1600875705480576, 0.6157666444778442, 0.0422471873462200, 0.0509155690670013, -1.1255714893341064, 0.0219085998833179));
	target2 += featureMap2.SampleLevel(sam, pos, 0);
	target2 = max(target2, 0) + float4(-0.3776825070381165, 1.2568452358245850, 0.3147132694721222, 1.0953333377838135) * min(target2, 0);

	tex3[gxy] = target1;
	tex4[gxy] = target2;
}


//!PASS 6
//!DESC sub-pixel convolution, aggregation
//!IN tex3, tex4, INPUT
//!OUT OUTPUT
//!BLOCK_SIZE 16
//!NUM_THREADS 64

// Chroma part of the RGB -> YUV transform used by this pass:
// mul(rgb2uv, rgb) yields a float2 holding (U, V). Luma is not produced here.
static const float2x3 rgb2uv = float2x3(
	float3(-0.169, -0.331, 0.5),	// U row
	float3(0.5, -0.419, -0.081)	// V row
);

// YUV -> RGB transform: mul(yuv2rgb, float3(Y, U, V)) yields (R, G, B).
// Every row takes Y with weight 1 and adds a weighted mix of U and V.
static const float3x3 yuv2rgb = float3x3(
	float3(1, -0.00093, 1.401687),	// R row
	float3(1, -0.3437, -0.71417),	// G row
	float3(1, 1.77216, 0.00099)	// B row
);

// Pass 6: final 3x3 convolution over tex3/tex4 followed by the write to OUTPUT.
//
// Each invocation owns one 2x2 block of OUTPUT pixels whose top-left corner is
// gxy. It reads the 3x3 neighbourhood of the matching texel from tex3 and tex4,
// reduces the 18 float4 samples to a single float4 (`result`), and uses each of
// the four components as the first (Y) channel of one pixel in the 2x2 block.
// The U/V channels come from INPUT, sampled at the output pixel centre through
// the LINEAR sampler `sam1`.
//
// blockStart: offset added to the per-thread position to obtain gxy
//             (presumably the origin of this thread group's block in output
//             pixels -- confirm against the Magpie framework).
// threadId:   only threadId.x is read; it is passed to Rmp8x8.
void Pass6(uint2 blockStart, uint3 threadId) {
	// Rmp8x8 is a framework helper (presumably maps the linear thread index to a
	// 2D position inside an 8x8 tile -- confirm). The shift by one doubles that
	// position so consecutive threads are two output pixels apart.
	uint2 gxy = (Rmp8x8(threadId.x) << 1) + blockStart;

	// Skip invocations whose 2x2 block starts outside the output.
	const uint2 outputSize = GetOutputSize();
	if (gxy.x >= outputSize.x || gxy.y >= outputSize.y) {
		return;
	}

	// Assumed to be the size of one input / output texel in normalized texture
	// coordinates (framework helpers -- TODO confirm).
	float2 inputPt = GetInputPt();
	float2 outputPt = GetOutputPt();

	// gxy >> 1 is the low-resolution texel covering this 2x2 block; +0.5 moves to
	// its centre before scaling to sampling coordinates.
	float2 pos = ((gxy >> 1) + 0.5f) * inputPt;

	// 3x3 neighbourhood around pos, fetched with the POINT sampler `sam`.
	// Suffix 1 = tex3, suffix 2 = tex4.
	// [tl, tc, tr]
	// [ml, mc, mr]
	// [bl, bc, br]
	float4 tl1 = tex3.SampleLevel(sam, pos - inputPt, 0);
	float4 ml1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl1 = tex3.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc1 = tex3.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc1 = tex3.SampleLevel(sam, pos, 0);
	float4 bc1 = tex3.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr1 = tex3.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br1 = tex3.SampleLevel(sam, pos + inputPt, 0);

	float4 tl2 = tex4.SampleLevel(sam, pos - inputPt, 0);
	float4 ml2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, 0), 0);
	float4 bl2 = tex4.SampleLevel(sam, pos + float2(-inputPt.x, inputPt.y), 0);
	float4 tc2 = tex4.SampleLevel(sam, pos + float2(0, -inputPt.y), 0);
	float4 mc2 = tex4.SampleLevel(sam, pos, 0);
	float4 bc2 = tex4.SampleLevel(sam, pos + float2(0, inputPt.y), 0);
	float4 tr2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, -inputPt.y), 0);
	float4 mr2 = tex4.SampleLevel(sam, pos + float2(inputPt.x, 0), 0);
	float4 br2 = tex4.SampleLevel(sam, pos + inputPt, 0);

	// Accumulator start value (presumably the layer bias of the trained model),
	// followed by one 4x4 weight matrix per sample. The literals are trained
	// weights; keep them and their accumulation order unchanged.
	float4 result = { 0.2160381823778152,0.2298326790332794,0.2062894254922867,0.2233859002590179 };
	result += mul(tl1, float4x4(-0.0031023439951241, 0.0059619112871587, 0.0058020050637424, 0.0062482208013535, 0.0052765505388379, -0.0022552218288183, -0.0065842187032104, -0.0008604917675257, 0.0003460258303676, -0.0022195784840733, -0.0074996319599450, -0.0015739878872409, 0.0056171459145844, 0.0002592361997813, 0.0019835520070046, 0.0018105609342456));
	result += mul(tl2, float4x4(0.0275507327169180, 0.0051669259555638, -0.0139658711850643, 0.0030883529689163, 0.0089544747024775, 0.0085759535431862, -0.0002981633588206, -0.0054096584208310, -0.0125233745202422, 0.0065056309103966, 0.0073427790775895, 0.0003864165919367, -0.0041021117940545, 0.0030372787732631, 0.0006185144884512, 0.0062267151661217));
	result += mul(ml1, float4x4(0.0105074141174555, -0.0004502697847784, -0.0009122507181019, 0.0026426143012941, 0.0040101194754243, -0.0023395312018692, 0.0017362632788718, 0.0091658774763346, 0.0205013062804937, -0.0020239499863237, -0.0114130824804306, -0.0075353132560849, 0.0225470513105392, -0.0366680622100830, -0.0000535918843525, 0.0050146644935012));
	result += mul(ml2, float4x4(0.0625989288091660, 0.1151928976178169, -0.0027104590553790, -0.0123298475518823, -0.0099824573844671, 0.0108926137909293, 0.0334197729825974, 0.0193741749972105, 0.0490560680627823, 0.0152345551177859, 0.0405332818627357, 0.0399475656449795, 0.0105373272672296, -0.0020778691396117, -0.0097913276404142, 0.0004653275245801));
	result += mul(bl1, float4x4(-0.0080901645123959, -0.0073888520710170, -0.0015905323671177, 0.0006022278103046, -0.0082205599173903, -0.0010853469138965, 0.0055166282691061, 0.0084898881614208, 0.0071090045385063, 0.0003273489419371, -0.0063129402697086, -0.0121429329738021, -0.0069875395856798, -0.0045646210201085, 0.0015905641485006, 0.0037552448920906));
	result += mul(bl2, float4x4(-0.0046724923886359, 0.0016644159331918, 0.0062663913704455, 0.0014710600953549, -0.0338839665055275, -0.0083738788962364, 0.0060393707826734, 0.0241303537040949, -0.0034021178726107, 0.0056042689830065, -0.0083823651075363, 0.0026392592117190, 0.0026657863054425, 0.0020144139416516, -0.0042312326841056, -0.0035609540063888));
	result += mul(tc1, float4x4(0.0009882192825899, 0.0002152582601411, -0.0123004913330078, -0.0005484037101269, -0.0111524788662791, 0.0119459852576256, -0.0154546750709414, 0.0188625976443291, 0.0204733721911907, -0.0079483250156045, -0.0007576426141895, -0.0042714662849903, 0.0307108033448458, -0.0069440440274775, 0.0000003838358680, 0.0070015545934439));
	result += mul(tc2, float4x4(-0.0063984976150095, -0.0027606852818280, -0.0355033427476883, 0.0163108259439468, -0.0317453853785992, 0.0353556163609028, -0.0016268522012979, 0.0312290452420712, 0.0149499354884028, -0.0139254443347454, 0.0109228380024433, 0.0234404578804970, 0.0088780215010047, 0.0083913588896394, 0.0070422240532935, 0.0086190626025200));
	result += mul(mc1, float4x4(0.4668360948562622, 0.5502970218658447, 0.5616708993911743, 0.4827409684658051, -0.4825374484062195, -0.4473012387752533, -0.3305214643478394, -0.5241096019744873, -0.1466400325298309, 0.0804405659437180, 0.0349484048783779, 0.0052239256910980, -0.1175492331385612, 0.0954673886299133, 0.1326161473989487, -0.1377900093793869));
	result += mul(mc2, float4x4(0.0407393611967564, 0.0987918972969055, -0.0807525441050529, -0.1235820129513741, -0.0658119991421700, 0.0621846020221710, -0.0816140249371529, 0.0406704805791378, 0.3287288248538971, 0.3512411415576935, -0.3979234397411346, -0.3275865316390991, 0.2961717247962952, 0.0169802401214838, 0.2695185840129852, 0.0150923738256097));
	result += mul(bc1, float4x4(-0.0050105354748666, -0.0008035619393922, -0.0079824347048998, 0.0376648232340813, -0.0048509594053030, -0.0067090289667249, -0.0259696990251541, 0.0086074192076921, 0.0126011213287711, -0.0174850821495056, 0.0156697910279036, 0.0028463243506849, 0.0099161649122834, -0.0114374589174986, -0.0169337950646877, 0.0257211104035378));
	result += mul(bc2, float4x4(0.0043578865006566, -0.0106186466291547, 0.0165964700281620, 0.0014091627672315, 0.0037815889809281, 0.0364324115216732, -0.0395258180797100, 0.0625703483819962, -0.0144113656133413, 0.0321297869086266, 0.0208382532000542, -0.0383783876895905, -0.0258011315017939, -0.0143258580937982, -0.0092717679217458, -0.0178859047591686));
	result += mul(tr1, float4x4(-0.0050332350656390, -0.0001488960406277, -0.0115145305171609, 0.0045622875913978, -0.0021374952048063, 0.0015796425286680, -0.0077072884887457, 0.0015828146133572, 0.0171351470053196, 0.0024297721683979, -0.0374940223991871, 0.0058337682858109, -0.0027352231554687, 0.0007051698048599, -0.0002820930676535, -0.0036236173473299));
	result += mul(tr2, float4x4(0.0016052227001637, 0.0049667325802147, -0.0081164520233870, -0.0035548578016460, -0.0131295239552855, 0.0158293209969997, -0.0340857952833176, 0.0222462061792612, -0.0019779701251537, -0.0012918257853016, 0.0048964750021696, 0.0049310824833810, 0.0011332486756146, -0.0004788591759279, -0.0017202866729349, 0.0012215448077768));
	result += mul(mr1, float4x4(-0.0057867411524057, -0.0133802210912108, 0.0158019792288542, 0.0344623439013958, -0.0040443707257509, -0.0037790425121784, 0.0045541841536760, 0.0198037009686232, 0.1110823750495911, 0.0822209641337395, -0.0382958762347698, -0.1459431499242783, 0.0010219293180853, -0.0161874033510685, -0.0285891294479370, 0.0300961043685675));
	result += mul(mr2, float4x4(-0.0089390166103840, -0.0105021111667156, -0.0130960196256638, -0.0257005076855421, 0.0099271116778255, -0.0267942640930414, 0.0677764937281609, 0.0239320658147335, -0.0469516664743423, -0.0230019800364971, -0.0446235798299313, -0.0338921397924423, -0.0061601125635207, 0.0139691382646561, -0.0029954034835100, -0.0108477231115103));
	result += mul(br1, float4x4(-0.0002513871586416, 0.0017334159929305, -0.0072467559948564, -0.0149013847112656, -0.0091025009751320, -0.0054195052944124, -0.0053499941714108, 0.0118143679574132, -0.0134334927424788, 0.0106377983465791, -0.0072384304367006, 0.0483081750571728, -0.0011459409724921, -0.0010465533705428, 0.0040362793952227, -0.0004637696838472));
	result += mul(br2, float4x4(0.0037248437292874, 0.0041816406883299, -0.0139264371246099, 0.0035959482192993, -0.0390684641897678, -0.0257617402821779, -0.0298785082995892, -0.0066674682311714, -0.0151000469923019, -0.0176323894411325, 0.0045121158473194, 0.0077007445506752, 0.0051746177487075, 0.0075236861594021, 0.0008582049049437, 0.0010294843232259));

	// Scatter the four components of `result` over the 2x2 output block.
	// i is the x offset and j the y offset, so with index = i * 2 + j:
	//   result.x -> (0,0), result.y -> (0,1), result.z -> (1,0), result.w -> (1,1)
	[unroll]
	for (uint i = 0; i <= 1; ++i) {
		[unroll]
		for (uint j = 0; j <= 1; ++j) {
			const uint2 destPos = gxy + uint2(i, j);

			// U/V of the source image at this output pixel's centre, filtered by
			// the LINEAR sampler.
			float2 originUV = mul(rgb2uv, INPUT.SampleLevel(sam1, (destPos + 0.5f) * outputPt, 0).rgb);
			const uint index = i * 2 + j;
			// Recombine the network output (Y) with the sampled U/V and convert to
			// RGB; alpha is fixed at 1.
			OUTPUT[destPos] = float4(mul(yuv2rgb, float3(result[index], originUV)), 1);
		}
	}
}
